# Load the raw data set; the file is chosen interactively.
# data <- read.csv("Data.csv", header = TRUE)
data <- read.csv(file.choose())
names(data)

# Build the modelling frame used for outlier detection and model fitting.
data_m <- data
# Shift the class label down by one; the class split below tests for 0 and 1
# (presumably the raw file codes the classes as 1/2 -- confirm against the data).
data_m$B1_S2 <- data$B1_S2 - 1
# as.numeric() on a factor returns the integer level codes instead of the
# printed values, so a factor column is converted through character first.
fund_tna <- data$Fund_TNA..k.
if (is.factor(fund_tna)) {
  fund_tna <- as.character(fund_tna)
}
data_m$Fund_TNA..k. <- as.numeric(fund_tna)
# Drop columns 1, 3, 5 and 7 of the raw layout in one step.
# data_m <- data_m[, -11]
data_m <- data_m[, -c(1, 3, 5, 7)]

# Split the observations by class label.
d0 <- data_m[data_m$B1_S2 == 0, ]
d1 <- data_m[data_m$B1_S2 == 1, ]
names(d0)

# Remove multivariate outliers from one class of observations.
#
# The squared Mahalanobis distance of every row to the column means is
# computed from the sample covariance of the predictors, plotted, and compared
# with a chi-square quantile whose degrees of freedom equal the number of
# predictors. Rows at or above the cutoff are dropped.
#
# obs        data frame holding the predictors plus the class label
# label_col  index of the class-label column, excluded from the distance
# level      chi-square probability used for the cutoff
#
# Returns `obs` restricted to the rows whose distance is below the cutoff.
drop_outliers <- function(obs, label_col = 5, level = 0.99) {
  x <- as.matrix(obs[, -label_col])
  # mahalanobis() returns the squared distances directly, so the n x n matrix
  # product needed only for its diagonal is avoided.
  md <- mahalanobis(x, center = colMeans(x), cov = var(x))
  cutoff <- qchisq(level, df = ncol(x))
  plot(md)
  print(cutoff)
  abline(h = cutoff)
  obs[md < cutoff, ]
}

# outlier delete, class 0
d2 <- drop_outliers(d0)
dim(d0)
dim(d2)

# same steps on class 1
names(d1)
d3 <- drop_outliers(d1)
dim(d1)
dim(d3)

# Recombine the cleaned classes.
d4 <- rbind(d2, d3)
dim(d4)

# Persist the outlier-free observations.
write.csv(d4, file = "Data_cleaned.csv", row.names = FALSE)

# From here on work with the cleaned data.
data_m <- d4
summary(data_m)

# Independence check: pairwise correlations of every column except column 5,
# followed by a scatter-plot matrix of all columns.
cor(data_m[, -5])
pairs(data_m)

# Training / testing split: 85% of the rows are sampled for training and the
# remaining rows form the test set.
set.seed(2015)
n <- nrow(data_m)
id <- sample(seq_len(n), size = floor(n * 0.85))
data_train <- data_m[id, ]
data_test <- data_m[-id, ]

# Logistic model
# Full model with every predictor; it serves as the upper scope for the
# stepwise search below.
m1 <- glm(B1_S2 ~ ., data = data_train, family = "binomial")
summary(m1)

# Stepwise selection in both directions, starting from the intercept-only model.
nothing <- glm(B1_S2 ~ 1, data = data_train, family = "binomial")
bothways <- step(
  nothing,
  scope = list(lower = formula(nothing), upper = formula(m1)),
  direction = "both"
)
formula(bothways)
m11 <- glm(formula(bothways), data = data_train, family = "binomial")

# predict() on a glm defaults to the link (log-odds) scale; type = "response"
# is required for the probabilities that the 0.5 threshold is meant for.
# The complete test frame is passed so that every variable the stepwise
# search may have kept is available to predict().
pred1 <- predict(m11, newdata = data_test, type = "response")
m1_classifi <- table(pred1 > 0.5, data_test[, 5])
m1_classifi

#Bayes Classifier
#install.packages('e1071')
#library("e1071")
#data_train1<-data_train[,-7]
#m2<-naiveBayes(as.factor(B1_S2)~.,data = data_m);m2
#pred2<-predict(m2,data_m);length(pred2)
#m2_classifi<-table(pred2,data_m[,5]);m2_classifi

# knn
library(class)
set.seed(3000)
n <- dim(data_train)[1]
n

# Column 5 holds the class label. It must not be part of the feature
# matrices, otherwise the neighbours are found using the answer itself.
knn_train_x <- data_train[, -5]
knn_test_x <- data_test[, -5]
knn_train_y <- data_train[, 5]
knn_test_y <- data_test[, 5]

# Number of correctly classified test rows for each k in 1..100. Counting
# matches directly (rather than indexing the 2 x 2 table) also works when the
# test set contains a single class.
# NOTE(review): k is tuned on the test set, so the resulting table is an
# optimistic estimate -- consider a separate validation split.
k_grid <- 1:100
n_correct <- vapply(
  k_grid,
  function(k) {
    fitted <- knn(knn_train_x, knn_test_x, knn_train_y, k = k)
    sum(as.character(fitted) == as.character(knn_test_y))
  },
  integer(1)
)

# which.max() returns the first maximum, i.e. the smallest k among ties.
k_optimal <- k_grid[which.max(n_correct)]
ratio <- max(n_correct)
k_optimal

knn.pred <- knn(knn_train_x, knn_test_x, knn_train_y, k = k_optimal)
m2_classifi <- table(knn.pred, knn_test_y)
m2_classifi

#ann
library("nnet")
set.seed(1908)

# Fit a single-hidden-layer network several times and keep the best fit.
#
# nnet() is called repeatedly and the fit with the smallest `value` component
# is returned.
#
# x       data frame of predictors
# y       response vector, used as the left-hand side of `y ~ .`
# size    number of hidden units, passed to nnet()
# maxit   maximum number of iterations, passed to nnet()
# linout  passed to nnet() as its `linout` argument
# try     total number of fits to attempt; at least one fit is always made
#
# Returns the nnet fit with the lowest `value` among all attempts.
ann <- function(x, y, size, maxit = 500, linout = FALSE, try = 30) {
  best_fit <- nnet(y ~ ., data = x, size = size, maxit = maxit, linout = linout)
  # seq_len() yields an empty sequence when try is 1, whereas 2:try would
  # count down (2, 1) and run two unwanted extra fits.
  for (i in seq_len(max(try - 1, 0))) {
    fit <- nnet(y ~ ., data = x, size = size, maxit = maxit, linout = linout)
    if (fit$value < best_fit$value) {
      best_fit <- fit
    }
  }
  best_fit
}

# The class label (column 5) is turned into a factor before fitting.
data_train[[5]] <- as.factor(data_train[[5]])
names(data_train)

# Network with a hidden layer of size 3, fitted on the training predictors.
m3 <- ann(
  x = data_train[, -5],
  y = data_train[, 5],
  size = 3,
  linout = FALSE
)

# Predictions for the held-out test rows, cut at 0.5 and cross-tabulated
# against the test labels.
pred3 <- predict(m3, newdata = data_test[, -5])
m3_classifi <- table(pred3 > 0.5, data_test[, 5])
m3_classifi

# plot ann
# Install neuralnet only when it is missing, then load it with library() so a
# failed load stops the script instead of returning FALSE silently.
if (!requireNamespace("neuralnet", quietly = TRUE)) {
  install.packages("neuralnet")
}
library(neuralnet)
names(data_train)

# Work on a copy with a numeric response: the class label in column 5 may
# already have been converted to a factor.
nn_data <- data_train
nn_response <- names(nn_data)[5]
nn_data[[nn_response]] <- as.numeric(as.character(nn_data[[nn_response]]))
head(nn_data)

# Build the formula from column names so that the response really is the class
# label and the predictors are all remaining columns. Indexing a model matrix
# by position would pick the intercept and a predictor instead.
nn_formula <- reformulate(
  setdiff(names(nn_data), nn_response),
  response = nn_response
)
nn <- neuralnet(
  nn_formula,
  data = nn_data,
  hidden = 3,
  err.fct = "sse",
  linear.output = FALSE
)
plot(nn)

# classification tree
library(rpart)

# Columns 1, 2 and 4 are treated as categorical predictors and column 5 is the
# class label.
tree_factor_cols <- c(1, 2, 4, 5)
data_train[tree_factor_cols] <- lapply(data_train[tree_factor_cols], as.factor)

# The test predictors get the same factor coding (with the training levels) so
# that predict() sees the column types the tree was fitted on. Values absent
# from the training data become NA.
tree_test <- data_test
for (j in c(1, 2, 4)) {
  tree_test[[j]] <- factor(tree_test[[j]], levels = levels(data_train[[j]]))
}

m4 <- rpart(
  B1_S2 ~ Main + Fund_TNA..k. + SIFI + Spread + Not_amt..k. + Big_bang,
  data = data_train,
  method = "class"
)
plot(m4, asp = 45, main = "Classification Tree")
text(m4, use.n = TRUE, cex = 0.6)
print(m4)

# Class probabilities for the test rows; a row is assigned class 1 when the
# second probability column exceeds 0.5, otherwise class 0.
pr <- predict(m4, tree_test[, -5])
cl <- 1 * (pr[, 2] > 0.5)
m4_classifi <- table(cl, data_test$B1_S2)

# misclassification tables
m1_classifi
m2_classifi
m3_classifi
m4_classifi
